# -*- coding: utf-8 -*-
# @Author: lidongdong
# @time  : 19-1-23 下午10:03
# @file  : plainobj.py

import jieba
import re


class CaptionUnit(object):
    """A captioned item: an index, an optional file name and its captions.

    Attributes:
        index: Value supplied by the caller; stored unchanged.
        filename: Value supplied by the caller, or None.
        captions: List of caption strings.
        caption_splits: One list of tokens per caption, rebuilt by
            ``to_dict()``. Tokens are UTF-8 encoded byte strings.
    """

    # Runs of whitespace / ASCII punctuation, or runs of the listed
    # full-width punctuation. Compiled once instead of once per caption.
    _PUNCTUATION_RE = re.compile(
        u"[\\s+\\.\\!\\/_,$%^*(+\"']+|[+——！，。？、~@#￥%……&*]+")

    def __init__(self, index, filename=None, captions=None):
        """Store the fields and tokenize the captions immediately.

        Args:
            index: Identifier for this unit.
            filename: Optional file name associated with the captions.
            captions: Optional list of caption strings. When omitted, a new
                empty list is created for this instance (a shared ``[]``
                default would leak captions between instances).
        """
        self.index = index
        self.filename = filename
        self.captions = [] if captions is None else captions
        self.to_dict()

    def to_dict(self):
        """Populate ``self.caption_splits`` from ``self.captions``.

        Each caption has every punctuation/whitespace run replaced by a
        single space, is segmented with ``jieba.lcut``, and has its tokens
        UTF-8 encoded; tokens equal to a single space are dropped.

        Despite its name, this method returns nothing; the result is stored
        on the instance.
        """
        # NOTE(review): captions are presumably unicode text or ASCII-only
        # byte strings; mixing non-ASCII byte strings with the unicode
        # pattern looks unsafe on Python 2 -- confirm against callers.
        splits = []
        for caption in self.captions:
            cleaned = self._PUNCTUATION_RE.sub(u" ", caption)
            encoded_tokens = (piece.encode("utf-8")
                              for piece in jieba.lcut(cleaned))
            # The separators introduced above come back from jieba as
            # single-space tokens; they carry no content, so skip them.
            splits.append([tok for tok in encoded_tokens if tok != b" "])
        self.caption_splits = splits


if __name__ == '__main__':
    # Manual smoke run: build one unit from two sample captions and dump
    # every attribute stored on the instance.
    sample_captions = ["this is a.", "big ! pig. hello word !!!"]
    unit = CaptionUnit(0, filename="abc.jpg", captions=sample_captions)
    print(unit.__dict__)
